{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "out = 'xlnet-base-bahasa-cased'\n",
    "os.makedirs(out, exist_ok=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import XLNetTokenizer, XLNetModel, XLNetConfig, AutoTokenizer, AutoModelWithLMHead, pipeline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "('xlnet-base-bahasa-cased/spiece.model',\n",
       " 'xlnet-base-bahasa-cased/special_tokens_map.json',\n",
       " 'xlnet-base-bahasa-cased/added_tokens.json')"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tokenizer = XLNetTokenizer('sp10m.cased.v9.model', do_lower_case = False)\n",
    "tokenizer.save_pretrained('xlnet-base-bahasa-cased')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "tokenizer = XLNetTokenizer.from_pretrained('./xlnet-base-bahasa-cased', do_lower_case = False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# !transformers-cli convert --model_type xlnet \\\n",
    "#   --tf_checkpoint output-model/model.ckpt-300000 \\\n",
    "#   --config output-model/config.json \\\n",
    "#   --pytorch_dump_output xlnet-base-bahasa-cased"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "directory = 'xlnet-base-bahasa-cased'\n",
    "config = XLNetConfig(f'{directory}/config.json')\n",
    "config.vocab_size = 32000\n",
    "config.d_inner = 3072\n",
    "config.d_model = 768\n",
    "config.n_head = 12\n",
    "config.n_layer = 12"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "XLNetConfig {\n",
       "  \"architectures\": null,\n",
       "  \"attn_type\": \"bi\",\n",
       "  \"bi_data\": false,\n",
       "  \"bos_token_id\": null,\n",
       "  \"clamp_len\": -1,\n",
       "  \"d_head\": 64,\n",
       "  \"d_inner\": 3072,\n",
       "  \"d_model\": 768,\n",
       "  \"do_sample\": false,\n",
       "  \"dropout\": 0.1,\n",
       "  \"end_n_top\": 5,\n",
       "  \"eos_token_ids\": null,\n",
       "  \"ff_activation\": \"gelu\",\n",
       "  \"finetuning_task\": null,\n",
       "  \"id2label\": {\n",
       "    \"0\": \"LABEL_0\",\n",
       "    \"1\": \"LABEL_1\"\n",
       "  },\n",
       "  \"initializer_range\": 0.02,\n",
       "  \"is_decoder\": false,\n",
       "  \"label2id\": {\n",
       "    \"LABEL_0\": 0,\n",
       "    \"LABEL_1\": 1\n",
       "  },\n",
       "  \"layer_norm_eps\": 1e-12,\n",
       "  \"length_penalty\": 1.0,\n",
       "  \"max_length\": 20,\n",
       "  \"mem_len\": null,\n",
       "  \"model_type\": \"xlnet\",\n",
       "  \"n_head\": 12,\n",
       "  \"n_layer\": 12,\n",
       "  \"num_beams\": 1,\n",
       "  \"num_labels\": 2,\n",
       "  \"num_return_sequences\": 1,\n",
       "  \"output_attentions\": false,\n",
       "  \"output_hidden_states\": false,\n",
       "  \"output_past\": true,\n",
       "  \"pad_token_id\": null,\n",
       "  \"pruned_heads\": {},\n",
       "  \"repetition_penalty\": 1.0,\n",
       "  \"reuse_len\": null,\n",
       "  \"same_length\": false,\n",
       "  \"start_n_top\": 5,\n",
       "  \"summary_activation\": \"tanh\",\n",
       "  \"summary_last_dropout\": 0.1,\n",
       "  \"summary_type\": \"last\",\n",
       "  \"summary_use_proj\": true,\n",
       "  \"temperature\": 1.0,\n",
       "  \"top_k\": 50,\n",
       "  \"top_p\": 1.0,\n",
       "  \"torchscript\": false,\n",
       "  \"untie_r\": true,\n",
       "  \"use_bfloat16\": false,\n",
       "  \"vocab_size\": 32000\n",
       "}"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "config"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "model = AutoModelWithLMHead.from_pretrained('./xlnet-base-bahasa-cased/pytorch_model.bin', config = config)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "fill_mask = pipeline('fill-mask', model=model, tokenizer=tokenizer)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[{'sequence': 'makan ayam dengan sejenis<sep><cls>',\n",
       "  'score': 0.08394942432641983,\n",
       "  'token': 3038},\n",
       " {'sequence': 'makan ayam dengan dri<sep><cls>',\n",
       "  'score': 0.04811568558216095,\n",
       "  'token': 2777},\n",
       " {'sequence': 'makan ayam dengan kuy<sep><cls>',\n",
       "  'score': 0.023890310898423195,\n",
       "  'token': 7877},\n",
       " {'sequence': 'makan ayam dengan semestinya<sep><cls>',\n",
       "  'score': 0.023234495893120766,\n",
       "  'token': 6488},\n",
       " {'sequence': 'makan ayam dengan basically<sep><cls>',\n",
       "  'score': 0.022633543238043785,\n",
       "  'token': 22441}]"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "fill_mask('makan ayam dengan <mask>')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "model.save_pretrained('xlnet-base-bahasa-cased')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "# !transformers-cli upload ./xlnet-base-bahasa-cased"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "43349134146b4023a98e35f363f2b8b5",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "HBox(children=(FloatProgress(value=0.0, description='Downloading', max=467039973.0, style=ProgressStyle(descri…"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    "model = XLNetModel.from_pretrained('huseinzol05/xlnet-base-bahasa-cased')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "tokenizer = XLNetTokenizer.from_pretrained('huseinzol05/xlnet-base-bahasa-cased', do_lower_case = False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "import torch"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "input_ids = torch.tensor([tokenizer.encode(\"husein tk suka mkan ayam\", add_special_tokens=True)])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "tensor([[[-1.2767,  1.1698,  0.1059,  ..., -3.4251, -1.1081, -1.8970],\n",
       "         [ 4.7746,  1.6299, -2.1562,  ..., -1.9677, -0.8094, -0.2375],\n",
       "         [ 4.6481, -1.1669, -3.3487,  ..., -1.6533, -2.2662,  2.5585],\n",
       "         ...,\n",
       "         [-1.8671,  2.4500, -1.3904,  ..., -1.0455, -0.4507, -1.0828],\n",
       "         [ 3.3637,  1.3574, -2.3841,  ..., -1.8289,  7.4378, -1.6053],\n",
       "         [ 2.4336,  2.5622, -4.5100,  ..., -1.1985, -2.3939,  2.5001]]])"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "with torch.no_grad():\n",
    "    last_hidden_states = model(input_ids)[0]\n",
    "    \n",
    "last_hidden_states\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[{'sequence': 'makan ayam dengan sejenis<sep><cls>',\n",
       "  'score': 0.08394942432641983,\n",
       "  'token': 3038},\n",
       " {'sequence': 'makan ayam dengan dri<sep><cls>',\n",
       "  'score': 0.04811568558216095,\n",
       "  'token': 2777},\n",
       " {'sequence': 'makan ayam dengan kuy<sep><cls>',\n",
       "  'score': 0.023890310898423195,\n",
       "  'token': 7877},\n",
       " {'sequence': 'makan ayam dengan semestinya<sep><cls>',\n",
       "  'score': 0.023234495893120766,\n",
       "  'token': 6488},\n",
       " {'sequence': 'makan ayam dengan basically<sep><cls>',\n",
       "  'score': 0.022633543238043785,\n",
       "  'token': 22441}]"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "model = AutoModelWithLMHead.from_pretrained('huseinzol05/xlnet-base-bahasa-cased')\n",
    "fill_mask = pipeline('fill-mask', model=model, tokenizer=tokenizer)\n",
    "fill_mask('makan ayam dengan <mask>')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
